Importing needed packages

library(ape)
library(dendextend)
library(cluster)
library(tibble)
library(magrittr)
library(dplyr)
library(phytools)
library(mltools)
library(data.table)
library(factoextra)
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/convert_to_parenthesis.R")
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/cv_score.R")
library(tictoc)
library(mvMORPH)
library(tidyr)
library(MASS)
library(ggplot2)
library(ggthemes)
library(ggpubr)
library(gridBase)
library(grid)

Here we have the interest of comparing the scatterplot, dendrogram and score of each hierarchical clustering method in order to do a proof of consent regarding our score, i.e., verify that our score makes sense in hierarchical clustering context. For that, me simulate the data with \(n = 100\) sample points:

set.seed(500)
n = 100
p = 2

Y = sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))
mu_0 = c(0, 0)
mu_1 = c(2, 2)
S = diag(nrow = 2, ncol = 2)


X = matrix(nrow = n, ncol = p)
X[Y == 0, ] = round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] = round(mvrnorm(sum(Y == 1), mu_1, S), 4)

sim.data = as.data.frame(cbind(X , Y))
sim.data$Y = as.factor(Y)
colnames(sim.data) = c("X1", "X2", "Y")
row.names(sim.data) = c(1:nrow(sim.data))
head(sim.data, 4)

Simulating the “wrong” data:

X_3 = data.frame(X3 = rexp(n, 1))
head(X_3)

Making the ploting function:

plot_all_scores = function(original.data, cons.data, cl.list, dists = NA){
  list.names = names(cl.list)
  l = length(list.names)
  for(k in 1:l){
    hc.func = list.names[k]
    methods = cl.list[[hc.func]]
    m = length(methods)
    if(is.na(dists)){
      if(m > 0){
      for(c in 1:m){
        cons.hc = get(hc.func)(dist(scale(cons.data)), method = methods[c])
        cons.dend = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.dend, original.data[, -3])
        original.hc = get(hc.func)(dist(scale(original.data[, -3])), method = methods[c])
        original.dend = convert_to_phylo(original.hc)
        score.original = L_score(original.dend, original.data[, -3])
        p1.original = fviz_dend(original.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.original = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        p1.cons = fviz_dend(cons.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.cons = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        t_grob1 = text_grob(paste0(hc.func,".", methods[c],
        " original data score: ", round(score.original, 4)), size = 12)
        t_grob2 = text_grob(paste0(hc.func,".", methods[c],
        " wrong data score: ", round(score.cons, 4)), size = 12)
        plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
        plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
        g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
                       ncol = 2), nrow = 2, heights = c(1,5))
        g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
                       ncol = 2), nrow = 2, heights = c(1,5))
        show(g1)
        show(g2)
      }
      }else{
        cons.hc = get(hc.func)(dist(scale(cons.data)))
        cons.dend = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.dend, original.data[, -3])
        original.hc = get(hc.funct)(dist(scale(original.data[, -3])))
        original.dend = convert_to_phylo(original.hc)
        score.original = L_score(original.dend, original.data[, -3])
        p1.original = fviz_dend(original.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.original = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        p1.cons = fviz_dend(cons.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.cons = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        t_grob1 = text_grob(paste0(hc.func," original data score: ", 
                                   round(score.original, 4)))
        t_grob2 = text_grob(paste0(hc.func, " wrong data score: ", 
                                   round(score.cons, 4)))
        plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
        plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
        g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
                       ncol = 2), nrow = 2, heights = c(1,5))
        g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
                       ncol = 2), nrow = 2, heights = c(1,5))
        show(g1)
        show(g2)
      }
    }else{
        dists.length = length(dists)
        for(h in 1:dists.length){
        if(m > 0){
        for(c in (1:m)){
        cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]), method = methods[c])
        cons.dend = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.dend, original.data[, -3])
        original.hc = get(hc.func)(dist(scale(original.data[, -3]), method = dists[h]), 
                             method = methods[c])
        original.dend = convert_to_phylo(original.hc)
        score.original = L_score(original.dend, original.data[, -3])
        p1.original = fviz_dend(original.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.original = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        p1.cons = fviz_dend(cons.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.cons = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        t_grob1 =  text_grob(paste0(hc.func,".", methods[c],
          " original data with ",dists[h]," distance score: ",
                                                           round(score.original, 4)))
        t_grob2 = text_grob(paste0(hc.func,".", methods[c],
         " wrong data with ", dists[h], " distance score: ", round(score.cons, 4)))
        plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
        plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
        g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
                       ncol = 2), nrow = 2, heights = c(1,5))
        g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
                       ncol = 2), nrow = 2, heights = c(1,5))
        show(g1)
        show(g2)
      }
        }else{
        cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]))
        cons.dend = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.dend, original.data[, -3])
        original.hc = hclust(dist(scale(original.data[, -3]), method = dists[h]))
        original.dend = convert_to_phylo(original.hc)
        score.original = L_score(original.dend, original.data[, -3])
        p1.original = fviz_dend(original.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.original = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        p1.cons = fviz_dend(cons.hc, k = 2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        p2.cons = temp_data %>%
          ggplot(aes(x = X1, y = X2, colour = labels)) +
          geom_point() +
          labs(x = "X1",
                y = "X2",
                colour = "Labels") +
          scale_colour_brewer(palette = "Set1") +
          theme_minimal()
        t_grob1 = text_grob(paste0(hc.func, " original data score: ", 
                                   round(score.original, 4)))
        t_grob2 =  text_grob(paste0(hc.func, " wrong data score: ", round(score.cons, 4)))
        plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
        plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
        g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
                       ncol = 2), nrow = 2, heights = c(1,5))
        g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
                       ncol = 2), nrow = 2, heights = c(1,5))
        show(g1)
        show(g2)
  }
  
        }
    }
  }
}

And now a list of all clustering methods and distances of interest:

dists = c("euclidean", "manhattan", "canberra")
test.list = list(hclust = c("ward.D", "single","ward.D2", "average", "complete", "mcquitty", "median", "centroid"), agnes = c("weighted", "complete", "average", "ward"), diana = NULL)
plot_all_scores(sim.data, X_3, test.list, dists)


Another interesting test to do is plot the dendrogram with the real labels than the chosen clusters and compare it to the same scatterplot. For that, we have the following example:

par(mfrow = c(1, 2), mar = c(2, 1.5, 2, 2))
hc.func = hclust(dist(scale(sim.data[, -3])), method = "ward.D2")
dend = as.dendrogram(hc.func)
phylo = convert_to_phylo(hc.func)
order = as.numeric(phylo$tip.label)
labels = sim.data$Y[order]
colors = ifelse(labels == 1, "red", "blue")
dend2 = assign_values_to_leaves_edgePar(dend = dend, value = colors, edgePar = "col")
labels = cutree(hc.func, k = 2)
temp_data = sim.data[, -3]
temp_data$labels = factor(labels)
# plotando
plot(dend2)
plot(sim.data$X1, sim.data$X2, col = temp_data$labels, main = "Test title")


Generalizing that for the above list of clustering methods:

plot_all_scores_2 = function(original.data, cons.data, cl.list, dists = NA){
  list.names = names(cl.list)
  l = length(list.names)
  par(mfrow = c(1, 2), mar = c(2, 1.5, 2, 2))
  for(k in 1:l){
    hc.func = list.names[k]
    methods = cl.list[[hc.func]]
    m = length(methods)
    if(is.na(dists)){
      if(m > 0){
      for(c in 1:m){
        # determining values of interest
        # cons
        cons.hc = get(hc.func)(dist(scale(cons.data)), method = methods[c])
        cons.dend = as.dendrogram(cons.hc)
        cons.phylo = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.phylo, original.data[, -3])
        order = as.numeric(cons.phylo$tip.label)
        cons.labels = original.data$Y[order]
        colors = ifelse(cons.labels == 1, "red", "blue")
        cons.dend2 = assign_values_to_leaves_edgePar(dend = cons.dend, value = 
                                                  colors, edgePar = "col")
        
        # original
        original.hc = get(hc.func)(dist(scale(original.data[, -3])), method = methods[c])
        original.dend = as.dendrogram(original.hc)
        original.phylo = convert_to_phylo(original.hc)
        score.original = L_score(original.phylo, original.data[, -3])
        order = as.numeric(original.phylo$tip.label)
        original.labels = original.data$Y[order]
        colors = ifelse(original.labels == 1, "red", "blue")
        original.dend2 = assign_values_to_leaves_edgePar(dend = original.dend, value = 
                                                  colors, edgePar = "col")
        
        
        # plotting for original
        plot(original.dend2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func,".", methods[c], 
                      "original data \n score: ", round(score.original, 4)))
        
        # plotting for fake
        plot(cons.dend2)
        factors = factor(cutree(cons.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func,".", methods[c],
                          "wrong data  \n score: ", round(score.cons, 4)))
        
      }
      }else{
        
        # determining values of interest
        # cons
        cons.hc = get(hc.func)(dist(scale(cons.data)))
        cons.dend = as.dendrogram(cons.hc)
        cons.phylo = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.phylo, original.data[, -3])
        order = as.numeric(cons.phylo$tip.label)
        cons.labels = original.data$Y[order]
        colors = ifelse(cons.labels == 1, "red", "blue")
        cons.dend2 = assign_values_to_leaves_edgePar(dend = cons.dend, value = 
                                                  colors, edgePar = "col")
        
        # original
        original.hc = get(hc.func)(dist(scale(original.data[, -3])))
        original.dend = as.dendrogram(original.hc)
        original.phylo = convert_to_phylo(original.hc)
        score.original = L_score(original.phylo, original.data[, -3])
        order = as.numeric(original.phylo$tip.label)
        original.labels = original.data$Y[order]
        colors = ifelse(original.labels == 1, "red", "blue")
        original.dend2 = assign_values_to_leaves_edgePar(dend = original.dend, value = 
                                                  colors, edgePar = "col")
        
        
        # plotting for original
        plot(original.dend2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func,
                              " original data \n score: ", round(score.original, 4)))
        
        # plotting for fake
        plot(cons.dend2)
        factors = factor(cutree(cons.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func,
                              " wrong data \n score: ", round(score.cons, 4)))
        
      }
    }else{
        dists.length = length(dists)
        for(h in 1:dists.length){
        if(m > 0){
        for(c in (1:m)){
        # determining values of interest
        # cons
        cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]), method = methods[c])
        cons.dend = as.dendrogram(cons.hc)
        cons.phylo = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.phylo, original.data[, -3])
        order = as.numeric(cons.phylo$tip.label)
        cons.labels = original.data$Y[order]
        colors = ifelse(cons.labels == 1, "red", "blue")
        cons.dend2 = assign_values_to_leaves_edgePar(dend = cons.dend, value = 
                                                  colors, edgePar = "col")
        
        # original
        original.hc = get(hc.func)(dist(scale(original.data[, -3]), method = dists[h]), 
                                        method = methods[c])
        original.dend = as.dendrogram(original.hc)
        original.phylo = convert_to_phylo(original.hc)
        score.original = L_score(original.phylo, original.data[, -3])
        order = as.numeric(original.phylo$tip.label)
        original.labels = original.data$Y[order]
        colors = ifelse(original.labels == 1, "red", "blue")
        original.dend2 = assign_values_to_leaves_edgePar(dend = original.dend, value = 
                                                  colors, edgePar = "col")
        
        # plotting for original
        plot(original.dend2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func,".", methods[c],
                            " original data with \n",dists[h]," distance score: ",
                              round(score.original, 4)))
        
        # plotting for fake
        plot(cons.dend2)
        factors = factor(cutree(cons.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main  = paste0(hc.func,".", methods[c],
                  " wrong data with \n", dists[h], " distance score: ", round(score.cons, 4)))
        
      }
        }else{
          
          
          # determining values of interest
        # cons
        cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]))
        cons.dend = as.dendrogram(cons.hc)
        cons.phylo = convert_to_phylo(cons.hc)
        score.cons =  L_score(cons.phylo, original.data[, -3])
        order = as.numeric(cons.phylo$tip.label)
        cons.labels = original.data$Y[order]
        colors = ifelse(cons.labels == 1, "red", "blue")
        cons.dend2 = assign_values_to_leaves_edgePar(dend = cons.dend, value = 
                                                  colors, edgePar = "col")
        
        # original
        original.hc = get(hc.func)(dist(scale(original.data[, -3]), method = dists[h]))
        original.dend = as.dendrogram(original.hc)
        original.phylo = convert_to_phylo(original.hc)
        score.original = L_score(original.phylo, original.data[, -3])
        order = as.numeric(original.phylo$tip.label)
        original.labels = original.data$Y[order]
        colors = ifelse(original.labels == 1, "red", "blue")
        original.dend2 = assign_values_to_leaves_edgePar(dend = original.dend, value = 
                                                  colors, edgePar = "col")
          
        
        # plotting for original
        plot(original.dend2)
        factors = factor(cutree(original.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main = paste0(hc.func, 
                        " original data \n score: ", round(score.original, 4)))
        
        # plotting for fake
        plot(cons.dend2)
        factors = factor(cutree(cons.hc, k = 2))
        temp_data = original.data[, -3]
        temp_data$labels = factors
        plot(temp_data$X1, temp_data$X2, col = temp_data$labels, 
             main  = paste0(hc.func,".", methods[c],
                  " wrong data with \n", dists[h], " distance score: ", round(score.cons, 4)))
        

  }
  
        }
    }
  }
}
dists = c("euclidean", "manhattan", "canberra")
test.list = list(hclust = c("ward.D", "single","ward.D2", "average", "complete", "mcquitty", "median", "centroid"), agnes = c("weighted", "complete", "average", "ward"), diana = NULL)
plot_all_scores_2(sim.data, X_3, test.list, dists)